***********************************************
* Title: rwanda_jde_table4.do
* Author: Todd Pugatch
* Last update: June 10 2024
*
* Description: analysis for Blimpo and Pugatch, "Entrepreneurship Education
*	and Teacher Training in Rwanda," Stage 2 Registered report, Journal of 
*	Development Economics
*
* Inputs: 	teacher_merge_jde.dta
*			student_merge_jde.dta
*
* Outputs: 	rwanda_jde_table4.[txt/out]
* Notes: produces Table 4
************************************************

* Set environment
local start=`"$S_TIME"'
clear
clear matrix
clear mata
graph drop _all
program drop _all
cap log close
set more off

* Set directories 
*global main "[SET MAIN DIRECTORY HERE]"
	global rawdata "$main/01_data/01_raw"
	global cleandata "$main/01_data/02_clean"
	global dofiles "$main/02_dofiles"
	global results "$main/03_results"
	global temp "$main/04_output"
	
log using "$temp/rwanda_jde_table4.txt", text replace

****************************
* SET UP MULTIPLE HYPOTHESIS CORRECTION
/*Proceed in 3 steps:
	1. Run all regressions for H1. Save coefficients & std. errors.
	2. Calculate q-values for all hypotheses.
	3. Re-run regressions to get output, including q-values.*/
****************************

* 1. GET COEF/SE/P FOR ALL REGRESSIONS	
* ACTIVE INSTRUCTION TIME: teacher observation data
/*Goal: produce columns 1-6 of Table P3
	Uses endline teacher classroom observations (Stallings tool).
	Regress active instruction time on treatment status, controlling
		for randomization strata.
	For Lee bounds, proceed in 3 steps:
		1) regress outcome on baseline and strata indicators
		2) get residuals
		3) calculate Lee bounds on these residuals
	Calculate Lee bounds first, then regression. This allows for exporting bounds
		into regression results. Note that this method does not always produce
		bounds containing the original coefficient, because treatment effect 
		from residualized regression not exactly equal to original treatment
		effect (because of potentially spurious correlation between treatment
		and covariates).
	Note that baseline data (BTQ300-302) only available for subset of teachers.*/
qui use "$cleandata/teacher_merge_jde.dta", clear

* restrict to teachers with classes observed at endline
qui keep if insample_obs_el==1

* prep missing values for teachers without baseline outcome
qui su pedagogy_active_index_bl if treatment==0 & insample_bl==1
qui replace pedagogy_active_index_bl=r(mean) if insample_bl!=1
qui gen pedagogy_active_index_blm=(insample_bl!=1)
lab var pedagogy_active_index_blm "missing value for `y0'"

* regressions: Table P3, columns 1-6
local Y "mainact_teacher_active_pct	mainact_teacher_active_0_27	mainact_teacher_active_28_52"
local Y0 "pedagogy_active_index_bl pedagogy_active_index_blm"

/*all observed classes: cols 1-3*/
local i=1
foreach y in `Y' {
	qui areg `y' treatment `Y0', a(strata) robust
	scalar define H2_c`i'b=_b[treatment]
	scalar define H2_c`i'se=_se[treatment]
	scalar define t=_b[treatment]/_se[treatment]
	scalar define H2_c`i'p=2*ttail(e(df_r),abs(t))
	local i=`i'+1
}

/*Skills Lab only: cols 4-6*/
foreach y in `Y' {
	qui areg `y' treatment `Y0' if skillslab_class_el==1, a(strata) robust
	scalar define H2_c`i'b=_b[treatment]
	scalar define H2_c`i'se=_se[treatment]
	scalar define t=_b[treatment]/_se[treatment]
	scalar define H2_c`i'p=2*ttail(e(df_r),abs(t))
	local i=`i'+1
}

/*how many Skills Labs in treatment v. control?*/
tab skillslab_class_el treatment, mi

* ACTIVE INSTRUCTION TECHNIQUES: teacher observation and student endline data
/*Goal: produce columns 7-9 of Table P3
	Uses endline teacher classroom observations (Stallings tool).
	Regress active instruction techniques on treatment status, controlling
		for randomization strata.
	Note that baseline data not applicable here.
	For data using student outcomes, cluster outcomes by school.*/

qui use "$cleandata/teacher_merge_jde.dta", clear

* restrict to teachers with classes observed at endline
qui keep if insample_obs_el==1

* regressions: Table P3, columns 7-8 (teacher observation data)
/*column 7: all classes*/
qui areg method_curric_pct_el treatment, a(strata) robust
scalar define H2_c`i'b=_b[treatment]
scalar define H2_c`i'se=_se[treatment]
scalar define t=_b[treatment]/_se[treatment]
scalar define H2_c`i'p=2*ttail(e(df_r),abs(t))
local i=`i'+1

/*column 8: Skills Labs only*/
qui areg method_curric_pct_el treatment if skillslab_class_el==1, a(strata) robust
scalar define H2_c`i'b=_b[treatment]
scalar define H2_c`i'se=_se[treatment]
scalar define t=_b[treatment]/_se[treatment]
scalar define H2_c`i'p=2*ttail(e(df_r),abs(t))
local i=`i'+1

* regressions: Table P3, column 9 (student endline)
/*Note that the dependent variable was defined inconsistently in two places of the Stage 1 Registered 
	Report. Therefore, include both "narrow" measure (consistent with table of hypotheses 
	[Table 1, ESQ600-603]) and "broad" measure (consistent with Table P3, ESQ600-611).*/
qui use "$cleandata/student_merge_jde.dta", clear
local Y "active_instruct_narrow_el active_instruct_broad_el"

foreach y in `Y' {
	qui areg `y' treatment, a(strata) cluster(school_code)
	scalar define H2_c`i'b=_b[treatment]
	scalar define H2_c`i'se=_se[treatment]
	scalar define t=_b[treatment]/_se[treatment]
	scalar define H2_c`i'p=2*ttail(e(df_r),abs(t))
	local i=`i'+1	
}

* 2. CALCULATE Q-VALUES FOR ALL HYPOTHESES
/*Calculate sharpened q-values from Benjamini, Krieger, Yuketeli (2006).
	Use code from fdr_sharpened_qvalues.do from Anderson (2008), available at 
		https://are.berkeley.edu/~mlanderson/ARE_Website/Research.html*/
* prep data: put p-values from above into a dataset
clear
local n=`i'-1
qui set obs `n'
qui gen hypothesis=2
qui gen column=_n
foreach x in b se pval {
	qui gen double `x'=.
}
forval i=1/`n' {
	qui replace b=H2_c`i'b in `i'
	qui replace se=H2_c`i'se in `i'
	qui replace pval=H2_c`i'p in `i' 
}

* START OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
* Collect the total number of p-values tested
quietly sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

quietly gen int original_sorting_order = _n
quietly sort pval
quietly gen int rank = _n if pval~=.

* Generate the variable that will contain the BKY (2006) sharpened q-values

qui gen bky06_qval = 1 if pval~=.
	
* Set the initial counter to 1 

local qval = 1

/* Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, 
	then checks which hypotheses are rejected at q = 0.999, 
	then checks which hypotheses are rejected at q = 0.998, etc.  
	The loop ends by checking which hypotheses are rejected at q = 0.001. */
	
while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	
	* Generate value q'*r/M
	qui gen fdr_temp1 = `qval_adj'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q'*r/M
	qui gen reject_temp1 = (fdr_temp1>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank1 = reject_temp1*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	
	* Generate value q_2st*r/M
	qui gen fdr_temp2 = `qval_2st'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	qui gen reject_temp2 = (fdr_temp2>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank2 = reject_temp2*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	qui replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001
}

quietly sort original_sorting_order

******************************
* END OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
******************************
* list results
list hypothesis column b se pval bky06_qval

* prep results for inclusion in final regression output
forval i=1/`n' {
	scalar define H2_c`i'q=bky06_qval in `i'
}


* 3. RE-RUN REGRESSIONS TO GET OUTPUT, INCLUDING Q-VALUES
* ACTIVE INSTRUCTION TIME: teacher observation data
/*Goal: produce columns 1-6 of Table P3
	Uses endline teacher classroom observations (Stallings tool).
	Regress active instruction time on treatment status, controlling
		for randomization strata.
	For Lee bounds, proceed in 3 steps:
		1) regress outcome on baseline and strata indicators
		2) get residuals
		3) calculate Lee bounds on these residuals
	Calculate Lee bounds first, then regression. This allows for exporting bounds
		into regression results. Note that this method does not always produce
		bounds containing the original coefficient, because treatment effect 
		from residualized regression not exactly equal to original treatment
		effect (because of potentially spurious correlation between treatment
		and covariates).
	Include sharpened q-values calculated above.
	Note that baseline data (BTQ300-302) only available for subset of teachers.*/
qui use "$cleandata/teacher_merge_jde.dta", clear

* restrict to teachers with classes observed at endline
qui keep if insample_obs_el==1

* prep missing values for teachers without baseline outcome
qui su pedagogy_active_index_bl if treatment==0 & insample_bl==1
qui replace pedagogy_active_index_bl=r(mean) if insample_bl!=1
qui gen pedagogy_active_index_blm=(insample_bl!=1)
lab var pedagogy_active_index_blm "missing value for `y0'"

* define program for obtaining Lee bounds
program getleebounds
	qui predict e if e(sample), residuals
	qui leebounds e treatment, select(insample_el)
	scalar define lb=_b[lower]
	scalar define ub=_b[upper]
	drop e
end

* regressions: Table P3, columns 1-6
local stat ""control mean",mu_c,"baseline mean",mu_0,"q-value",qv,"lower bound",lb,"upper bound",ub"
local Y "mainact_teacher_active_pct	mainact_teacher_active_0_27	mainact_teacher_active_28_52"
local Y0 "pedagogy_active_index_bl pedagogy_active_index_blm"
local i=1

/*all observed classes: cols 1-3*/
foreach y in `Y' {
	qui areg `y' `Y0', a(strata) robust
	getleebounds
	
	qui areg `y' treatment `Y0', a(strata) robust
	qui su `y' if treatment==0
	scalar define mu_c=r(mean)
	qui su pedagogy_active_index_bl if insample_bl==1
	scalar define mu_0=r(mean)
	scalar define qv=H2_c`i'q
	outreg2 using "$results/rwanda_jde_table4.xls", se excel nolabel nocons addstat(`stat') replace
	local i=`i'+1
}

/*Skills Lab only: cols 4-6*/
foreach y in `Y' {
	qui areg `y' `Y0' if skillslab_class_el==1, a(strata) robust
	getleebounds
	
	qui areg `y' treatment `Y0' if skillslab_class_el==1, a(strata) robust
	qui su `y' if treatment==0 & skillslab_class_el==1
	scalar define mu_c=r(mean)
	qui su pedagogy_active_index_bl if insample_bl==1 & skillslab_class_el==1
	scalar define mu_0=r(mean)
	scalar define qv=H2_c`i'q
	outreg2 using "$results/rwanda_jde_table4.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
}

/*how many Skills Labs in treatment v. control?*/
tab skillslab_class_el treatment, mi

* ACTIVE INSTRUCTION TECHNIQUES: teacher observation and student endline data
/*Goal: produce columns 7-9 of Table P3
	Uses endline teacher classroom observations (Stallings tool).
	Regress active instruction techniques on treatment status, controlling
		for randomization strata.
	Include sharpened q-values calculated above.
	Note that baseline data not applicable here.
	For data using student outcomes, cluster outcomes by school.*/
qui use "$cleandata/teacher_merge_jde.dta", clear

* restrict to teachers with classes observed at endline
qui keep if insample_obs_el==1

* regressions: Table P3, columns 7-8 (teacher observation data)
local stat ""control mean",mu_c,"q-value",qv,"lower bound",lb,"upper bound",ub"

/*column 7: all classes*/
qui areg method_curric_pct_el, a(strata) robust
getleebounds

qui areg method_curric_pct_el treatment, a(strata) robust
qui su method_curric_pct_el if treatment==0
scalar define mu_c=r(mean)
scalar define qv=H2_c`i'q
outreg2 using "$results/rwanda_jde_table4.xls", se excel nolabel nocons addstat(`stat') append
local i=`i'+1

/*column 8: Skills Labs only*/
qui areg method_curric_pct_el if skillslab_class_el==1, a(strata) robust
getleebounds

qui areg method_curric_pct_el treatment if skillslab_class_el==1, a(strata) robust
qui su method_curric_pct_el if treatment==0
scalar define mu_c=r(mean)
scalar define qv=H2_c`i'q
outreg2 using "$results/rwanda_jde_table4.xls", se excel nolabel nocons addstat(`stat') append
local i=`i'+1

* regressions: Table P3, column 9 (student endline)
/*Note that the dependent variable was defined inconsistently in two places of the Stage 1 Registered 
	Report. Therefore, include both "narrow" measure (consistent with table of hypotheses 
	[Table 1, ESQ600-603]) and "broad" measure (consistent with Table P3, ESQ600-611).*/
qui use "$cleandata/student_merge_jde.dta", clear
local Y "active_instruct_narrow_el active_instruct_broad_el"

foreach y in `Y' {
	qui areg `y', a(strata) cluster(school_code)
	getleebounds
	
	qui areg `y' treatment, a(strata) cluster(school_code)
	qui su `y' if treatment==0
	scalar define mu_c=r(mean)
	scalar define qv=H2_c`i'q /*outputting 1, no idea why*/
	outreg2 using "$results/rwanda_jde_table4.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
}

local end=`"$S_TIME"' 
di "`start'"
di "`end'"
log close
